table(ss[,Condition,by=Type]) Condition
Type cell_line metastasis primary tumour
ARMS 2 2 3 2
NBL 0 2 3 3
::: {.panel-tabset}
table(ss[,Condition,by=Type]) Condition
Type cell_line metastasis primary tumour
ARMS 2 2 3 2
NBL 0 2 3 3
dtable(ss)The added file will be used as starting point to annotate CalriomD array.
library(data.table)
anno <- data.table::fread(params$array_annotation)
names(anno) <- make.names(colnames(anno))
# anno[,rn:=Probe.Set.ID]
str(anno)Classes 'data.table' and 'data.frame': 134748 obs. of 3 variables:
$ Probe.Set.ID: chr "TC0100007786.hg.1" "TC1600011519.hg.1" "TC1600011520.hg.1" "TC0800012474.hg.1" ...
$ Gene.Symbol : chr "AGO1" "SEPT1" "SEPT1" "AGO2" ...
$ Gene.Title : chr "argonaute RISC catalytic component 1" "septin 1" "septin 1" "argonaute RISC catalytic component 2" ...
- attr(*, ".internal.selfref")=<externalptr>
The gene symbols come from different origin. Thus, we don’t know how many of them will match with the methylation known genes UCSC_RefGene_Name
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
known.genes<-IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
known.genes<-unique(unlist(sapply(known.genes,function(x)unlist(strsplit(x,";")))))
saveRDS(known.genes, "data/known.genes.RDS")We can check how many candidate genes we have some methylation data availabe from the EPIC array:
length(known.genes)[1] 27369
We can now check how many probes from the ClariomD panel map to those same genes:
table(anno$Gene.Symbol %in% known.genes)
FALSE TRUE
108769 25979
candidate_genes <- unique(intersect(unique(anno$Gene.Symbol),known.genes))25979 probes out of 134748 in the array will provide useful information that we can match with methylation data. That is a bit less than 20% of the array. In contrast, 88% of the genes present in the methylation array annotation are also present here.
One option would be that ClariomD has much more genes in it’s annotation than the EPIC arrays. Let’s check on that:
library(clariomdhumantranscriptcluster.db)Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: IRanges
Loading required package: S4Vectors
Attaching package: 'S4Vectors'
The following objects are masked from 'package:data.table':
first, second
The following objects are masked from 'package:base':
expand.grid, I, unname
Attaching package: 'IRanges'
The following object is masked from 'package:data.table':
shift
Loading required package: org.Hs.eg.db
select(clariomdhumantranscriptcluster.db, keys=keys(clariomdhumantranscriptcluster.db),columns = c("PROBEID","SYMBOL"))-> clariom_gene_symbol'select()' returned 1:many mapping between keys and columns
clariom_gene_symbol <- clariom_gene_symbol[!is.na(clariom_gene_symbol$SYMBOL),]
length(unique(clariom_gene_symbol$SYMBOL))[1] 26284
In summary:
su <- data.table(
# Probes in expression array:
N_probes_array=NROW(anno),
# Probes in expression array with associated gene symbols
N_probes_gene_exp=paste0(
length(unique(clariom_gene_symbol$PROBEID)),
" (",
round(length(unique(clariom_gene_symbol$PROBEID))/NROW(anno)*100),
"%)"
),
# Probes in expression array with associated gene symbols that are also associated to methylation EPIC array probes
N_probes_gene_exp_meth =paste0(
length(unique(clariom_gene_symbol[clariom_gene_symbol$SYMBOL %in% known.genes,"PROBEID"])),
" (",
round(length(unique(clariom_gene_symbol[clariom_gene_symbol$SYMBOL %in% known.genes,"PROBEID"]))/NROW(anno)*100),
"%)"
),
# List of genes associated to EPIC methylation array
N_genes_meth=length(known.genes),
# List of genesassociated to ClariomD expression array
N_genes_exp=length(unique(clariom_gene_symbol$SYMBOL)),
# List of genes associated both to methylation and expression arrrays
N_genes_exp_meth= paste0(
length(unique(intersect(clariom_gene_symbol$SYMBOL,known.genes))),
" (",
round(length(unique(intersect(clariom_gene_symbol$SYMBOL,known.genes)))/length(unique(clariom_gene_symbol$SYMBOL))*100),
"%)"
)
)
su N_probes_array N_probes_gene_exp N_probes_gene_exp_meth N_genes_meth
1: 134748 25570 (19%) 22140 (16%) 27369
N_genes_exp N_genes_exp_meth
1: 26284 22695 (86%)
The clariomD expression array contains 134748 probes from which 25570 (19%) probes are associated to 26284 different known genes (UCSC symbol). The EPIC methylation array has annotation associated to 27369 genes. In order to compare Differentially methylated genes (DMG) with differentially expressed genes (DEGs) we match the two annotation datasets by overlaping the genes (UCSC symbol) that are associated both to expression and methylation. There is a total of su$N_genes_exp_meth candidate genes that overlap which correspond to su$N_probes_gene_exp_meth probesets from the ClariomD expression array.
A possible explanation is that the Clariom D array is intended to have what Affy calls ‘deep content’, unfortunately much of which isn’t assocaited with any annotated genes. This is why of the ~139K probesets on that array, only su$N_probes_gene_exp have an associated gene (SYMBOL/UCSC_RefGene_Name). There are probably many reasons for that, but the primary reason is that Affy put a bunch of speculative content on the array which remains speculative to this day.
The MBNI group at Michigan do a re-mapping of the probesets where they just pretend Affy never did any annotating, and they take all the probes on the array, align to the genome and then throw out all the probes that don’t align to a unique genomic position. They then take the remaining probes and count up all those that align to a known gene location, and put them in new probesets that are just one probeset per gene. They take the 8.1M probes on this array and throw out 5.7M of them, and only use the remaining 29% of the probes to do that.
James W. Mac Donald https://support.bioconductor.org/p/126059/
install.packages("http://mbni.org/customcdf/25.0.0/entrezg.download/clariomdhumanhsentrezg.db_25.0.0.tar.gz")
install.packages("http://mbni.org/customcdf/25.0.0/entrezg.download/pd.clariomdhuman.hs.entrezg_25.0.0.tar.gz")Here we must start from the .CEL files to map the probes to the new probsets defined in pd.clariomdhuman.hs.entrezg
library(annotate)
library(pdInfoBuilder)
library(pd.clariomdhuman.hs.entrezg)
library(clariomdhumanhsentrezg.db)
MBNI_gene_symbol <- AnnotationDbi::select(clariomdhumanhsentrezg.db,keys=keys(clariomdhumanhsentrezg.db),columns=c("SYMBOL","PROBEID"))
su2 <- data.table(
N_probes_array=NROW(anno),
N_probes_gene_exp=paste0(
length(unique(MBNI_gene_symbol$PROBEID)),
" (",
round(length(unique(MBNI_gene_symbol$PROBEID))/NROW(anno)*100),
"%)"
),
N_probes_gene_exp_meth =paste0(
length(unique(MBNI_gene_symbol[MBNI_gene_symbol$SYMBOL %in% known.genes,"PROBEID"])),
" (",
round(length(unique(MBNI_gene_symbol[MBNI_gene_symbol$SYMBOL %in% known.genes,"PROBEID"]))/NROW(anno)*100),
"%)"
),
N_genes_meth=length(known.genes),
N_genes_exp=length(unique(MBNI_gene_symbol$SYMBOL)),
N_genes_exp_meth= paste0(
length(unique(intersect(MBNI_gene_symbol$SYMBOL,known.genes))),
" (",
round(length(unique(intersect(MBNI_gene_symbol$SYMBOL,known.genes)))/length(unique(MBNI_gene_symbol$SYMBOL))*100),
"%)"
)
)
su2 We can use the super-enhancers dataset for high levels of EWS-FLI1 protein from the paper (Tomazou2015?):
superenh_anno <- data.table::fread("https://medical-epigenomics.org/papers/tomazou2015/resultTables/Super-enhancersEWS-FLI1high.bed")namedlist <- function(...) {
nl <- list(...)
argnames <- sys.call()
n<-sapply(argnames[-1], as.character)
names(nl)<-n
return(nl)
}
library(readxl)
library(data.table)
counts<-list()
#NBL:
NBL_raw <- readxl::read_xlsx("raw/20230712_OMartinez_ARMSNBL/DEG/Norm_NBL_values.xlsx", sheet = 1)
NBL_raw <- unique(NBL_raw[,!colnames(NBL_raw)%in%c("Gene.Symbol","Gene.Title")])
NBL<-merge(anno,NBL_raw,by="Probe.Set.ID")
#ARMS:
ARMS_raw <- readxl::read_xlsx("raw/20230712_OMartinez_ARMSNBL/DEG/Norm-RMS-merged.xlsx", sheet = 1)
ARMS_raw <- ARMS_raw[,!colnames(ARMS_raw)%in%c("Gene.Symbol","Gene.Title")]
ARMS<-merge(anno,ARMS_raw,all.y=T,by="Probe.Set.ID")
counts <-namedlist(NBL,ARMS)str(counts)List of 2
$ NBL :Classes 'data.table' and 'data.frame': 134748 obs. of 34 variables:
..$ Probe.Set.ID: chr [1:134748] "TC0100006432.hg.1" "TC0100006433.hg.1" "TC0100006434.hg.1" "TC0100006435.hg.1" ...
..$ Gene.Symbol : chr [1:134748] "DDX11L1" "spopoybu" "MIR1302-10" "OR4G4P" ...
..$ Gene.Title : chr [1:134748] "DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1" "Transcript Identified by AceView" "microRNA 1302-10" "olfactory receptor, family 4, subfamily G, member 4 pseudogene" ...
..$ rn : chr [1:134748] "15568" "122807" "36412" "91829" ...
..$ NB5_1MP : num [1:134748] 4.68 6.28 5.06 3.27 3.92 ...
..$ NB5_1TP : num [1:134748] 4.92 5.88 5.17 2.82 4.11 ...
..$ NB5_3MP : num [1:134748] 4.85 6.66 5.24 3.51 4.12 ...
..$ NB5_3TP : num [1:134748] 4.83 6.21 4.99 3.37 4.27 ...
..$ NB5_5MP : num [1:134748] 4.73 5.82 4.52 3.4 4.25 ...
..$ NB5_5TP : num [1:134748] 5.1 6 5.15 3.4 3.62 ...
..$ NB5_6MP : num [1:134748] 4.64 6.66 4.93 3.14 3.81 ...
..$ NB5_6TP : num [1:134748] 4.55 6.2 4.96 2.9 3.91 ...
..$ NB5_9MP : num [1:134748] 4.12 6.09 4.9 3.34 3.67 ...
..$ NB5_9TP : num [1:134748] 4.89 6.14 4.89 3.41 3.39 ...
..$ NB16_2MP : num [1:134748] 4.8 6 5.43 3.15 4.42 ...
..$ NB16_2TP : num [1:134748] 5.02 6.19 4.99 3.42 3.87 ...
..$ NB16_3MP : num [1:134748] 4.84 6.17 4.95 3.28 3.84 ...
..$ NB16_3TP : num [1:134748] 4.48 5.94 4.83 3.68 3.77 ...
..$ NB16_6TP : num [1:134748] 5.05 6.1 5.18 3.18 4 ...
..$ NB16_6MP : num [1:134748] 4.57 6.35 5.03 3.6 3.74 ...
..$ NB16_7TP : num [1:134748] 4.91 6.17 4.98 3.29 4.04 ...
..$ NB16_8TP : num [1:134748] 4.96 6.17 5.09 2.97 4.04 ...
..$ NB16_9MP : num [1:134748] 4.68 6.12 5.09 3.82 3.93 ...
..$ NB16_9TP : num [1:134748] 4.94 6.25 4.87 3.44 3.52 ...
..$ NB5_1 : num [1:134748] 4.63 5.67 4.57 3.54 3.61 ...
..$ NB5_2 : num [1:134748] 4.69 5.78 4.94 3.31 3.73 ...
..$ NB5_3 : num [1:134748] 4.61 6.13 5.86 2.63 4.11 ...
..$ NB5_4 : num [1:134748] 5.1 5.86 5.43 3.15 3.99 ...
..$ NB5_5 : num [1:134748] 5.07 6.55 4.77 3.16 3.86 ...
..$ NB16_1 : num [1:134748] 4.35 6.31 4.85 3.21 4.45 ...
..$ NB16_2 : num [1:134748] 4.67 6.19 4.63 3.17 4.34 ...
..$ NB16_3 : num [1:134748] 4.47 5.88 4.97 3.15 3.57 ...
..$ NB16_4 : num [1:134748] 4.7 6.32 4.94 2.88 4.12 ...
..$ NB16_5 : num [1:134748] 4.3 6.19 5.12 3.18 4.05 ...
..- attr(*, ".internal.selfref")=<externalptr>
..- attr(*, "sorted")= chr "Probe.Set.ID"
$ ARMS:Classes 'data.table' and 'data.frame': 134748 obs. of 16 variables:
..$ Probe.Set.ID: chr [1:134748] "TC0100007786.hg.1" "TC0X00010643.hg.1" "TC0900011072.hg.1" "TC0200007996.hg.1" ...
..$ Gene.Symbol : chr [1:134748] "AGO1" "SEPT6" "ABCA1" "AC007881.4" ...
..$ Gene.Title : chr [1:134748] "argonaute RISC catalytic component 1" "septin 6" "ATP binding cassette subfamily A member 1" NA ...
..$ rn : chr [1:134748] "1" "10" "100" "1000" ...
..$ 1T : num [1:134748] 9.46 9.45 5.91 7.3 7.11 ...
..$ 1M : num [1:134748] 9.78 9.29 6.28 6.96 7.5 ...
..$ 2T : num [1:134748] 9.24 9.19 5.69 7.15 7.24 ...
..$ 2M : num [1:134748] 9.62 9.27 5.41 7.22 7.68 ...
..$ 3T : num [1:134748] 9.07 9.06 6.06 7 7.24 ...
..$ 3M : num [1:134748] 9.5 9.48 5.62 7.32 7.59 ...
..$ RH.1 : num [1:134748] 9.6 9.82 5.61 7.77 7.93 ...
..$ RH.2 : num [1:134748] 9.58 9.94 5.94 7.72 7.87 ...
..$ RH.3 : num [1:134748] 9.73 10.22 5.81 7.74 7.81 ...
..$ N1.1 : num [1:134748] 9.94 10 5.81 7.55 7.73 ...
..$ N1.2 : num [1:134748] 10.03 9.86 5.66 7.47 7.62 ...
..$ N1.3 : num [1:134748] 10.18 10.21 5.95 7.61 7.71 ...
..- attr(*, ".internal.selfref")=<externalptr>
..- attr(*, "index")= int(0)
.. ..- attr(*, "__Probe.Set.ID")= int [1:134748] 40938 25345 64099 125672 125669 125664 23338 67697 6624 60703 ...
In order to have the same contrasts as in the methylation analysis we create the following table:
values <- tibble::tibble( # Use all possible combinations of input settings.
group_var = "Sample_Group",
Contrasts = c(paste(c("Sample_Grouptumour_ARMS-Sample_Groupmetastasis_ARMS", #T1 vs T2 -> mice
"Sample_Groupcell_line_ARMS-Sample_Groupprimary_ARMS", #T0 vs T3 -> start end
"Sample_Groupcell_line_ARMS-Sample_Grouptumour_ARMS", #T0 vs T1 -> tumor
"Sample_Groupmetastasis_ARMS-Sample_Groupprimary_ARMS", #T2 vs T3 -> metastasi
"Sample_Groupcell_line_ARMS-(Sample_Groupprimary_ARMS+Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS)/3", #T0 vs T1&T2&T3
"(Sample_Groupprimary_ARMS+Sample_Groupcell_line_ARMS)/2-(Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS)/2" #T0&T1 vs T2&T3
),collapse = ";"
),
paste(c("Sample_Groupprimary_NBL-Sample_Grouptumour_NBL", #T0 vs T1 -> tumor
"Sample_Grouptumour_NBL-Sample_Groupmetastasis_NBL", #T1 vs T2 -> mice
"(Sample_Groupprimary_NBL+Sample_Grouptumour_NBL)/2-Sample_Groupmetastasis_NBL", #T0&T1 vs T2 -> tumor vs metastasi
"Sample_Groupprimary_NBL-(Sample_Grouptumour_NBL+Sample_Groupmetastasis_NBL)/2" #T0 vs T1&T2 -> culture vs Tissue
),collapse = ";"
),
paste(c("Sample_Grouptumour_ARMS-Sample_Grouptumour_NBL", # T1 -> tumors
"Sample_Groupmetastasis_ARMS-Sample_Groupmetastasis_NBL", #T2 -> metastasi
"Sample_Groupcell_line_ARMS-Sample_Groupprimary_NBL", # T0 CL
"(Sample_Groupcell_line_ARMS+Sample_Groupprimary_NBL)/2-(Sample_Grouptumour_ARMS+Sample_Groupmetastasis_ARMS+Sample_Grouptumour_NBL+Sample_Groupmetastasis_NBL)/4" #T0 vs T1&T2 -> culture vs Tissue
),collapse = ";"
)
),
Contrast_names=list(c("At1-At2:tissue","At0-At3:start/end","At0-At1:tumors","At2-At3:metastasis","At0-All:Cell_line","At0At1 - At2At3:TuMet"),
c("Nt0-Nt1:tumor","N1-Nt2:tissue","Nt0+Nt1-Nt2:tumor/metastasi","Nt0-All:culture/tissue"),
c("At1-Nt1:tumors","At2-Nt2:metastasi","At0-Nt0:Cultures","culture vs tissue")
)
)library(data.table)
ARMS<-counts[["ARMS"]]
ARMS_ids<-intersect(ss$Exp_SID, colnames(ARMS))
ARMS[,rn:=as.numeric(rownames(ARMS))]
anno_ids<- c("rn",colnames(anno))
setkey(ss,Exp_SID)
ARMS_exp.mat<-as.matrix(ARMS[,.SD,.SDcols=ARMS_ids])
ARMS_ss <- ss[ARMS_ids,]ARMS.model <- mod(object = ARMS_exp.mat,
group_var = values$group_var[1],
contrasts = values$Contrasts[1],
covs= NULL ,
metadata = ARMS_ss,
idcol="Exp_SID",
rename=values$Contrast_names[[1]])DEGs <- DMPextr(fit=ARMS.model,p.value = 0.05, beta_normalized = ARMS[,.SD,.SDcols=ARMS_ids],mDiff = 0.000001,ann=ARMS[,.SD,.SDcols=anno_ids],writeOut = F)DEGs$Type<-ifelse(DEGs$logFC>0,"Hyper","Hypo")
apply_filter_dmps(dmps = DEGs,p.value = seq(0.00,0.05,0.01),mDiff = seq(0,4,.5),path = "analysis/DE")
DEGs[,diffexpressed:=ifelse(abs(logFC)<2,"NONE",ifelse(logFC<0,"DOWN","UP")) ]
vp<-vulcanos(data=DEGs,x="logFC",y="adj.P.Val",col="diffexpressed",label="Gene.Symbol",labSize = 2,max.overlaps = 100)
ggsave(filename = "DE_vulcano_plots.png",plot = vp,device = "png",path = "analysis/DE",dpi = "retina" )
A previous methylation analysis was performed. You can check the results here:
dmrs<-as.data.table(readRDS("data/dmrs1_ARMS.rds"))
dmrs<-dmrs[!is.na(overlapping.genes),]
ano_cols<-c("seqnames","start","end","width","no.cpgs","HMFDR","meandiff","Contrast")
dt_list<- lapply(
1:NROW(dmrs),function(rn){ # For every position or ProbeID do:
UCSC <- # 1. generate a data.table (per row)
dmrs[rn, # 2. split UCSC gene and position
lapply( # 3. new row for every combination of gene/position
overlapping.genes, # 4. add Contrast porbeID and bval difference
function(x)unlist(strsplit(x,","))
)
,by=ano_cols]
})
dt_dmrs <- rbindlist(dt_list)
DMRs_ARMS <- dt_dmrs[,Gene:=V1][!is.na(V1),]
DMRs_ARMS$V1 <- NULL
saveRDS(DMRs_ARMS,"data/DMRs_ARMS.rds")DMRs<-DMRs_ARMS
dtable(DMRs)Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
In first place let’s see if there is any overlap between Differentially methylated genes (DMGs) and Differentially Expressed Genes (DEGs):
DEGs[,Gene:=Gene.Symbol]
dt_DEG_DMR <- merge(DMRs_ARMS,DEGs,by=c("Gene","Contrast"),all = T)
tab <- dt_DEG_DMR[,.(DEGs=sum(!is.na(Probe.Set.ID)),DMRs=sum(!is.na(start)),overlap=sum(!(is.na(start)) &!(is.na(Probe.Set.ID)) )),by=c("Contrast")]
kableExtra::kbl(tab)|>kableExtra::kable_classic_2()| Contrast | DEGs | DMRs | overlap |
|---|---|---|---|
| At0-All:Cell_line | 3201 | 6650 | 160 |
| At0-At1:tumors | 4800 | 5781 | 214 |
| At0-At3:start/end | 147 | 7494 | 8 |
| At0At1 - At2At3:TuMet | 11213 | 1517 | 123 |
| At2-At3:metastasis | 3043 | 1046 | 25 |
| At1-At2:tissue | 0 | 1 | 0 |
In the table above there is one row for each DMG associated to a DMR, so there may be more ### Venn diagrams:
library(gridExtra)
library(venn)
library(ggplot2)
library(ggpolypath)
library(grid)
dt_venn <- dt_DEG_DMR[!is.na(Gene.Symbol)&!is.na(start),Gene,by=Contrast]
genelist <- unique(dt_venn$Gene)
gl <- as.data.frame(lapply(unique(dt_venn$Contrast),function(x)genelist %in% dt_venn[Contrast==x,Gene]))
names(gl)<-unique(dt_venn$Contrast)
ARMS_venn.plot <- venn::venn(gl,ilabels=TRUE, zcolor = "style", ggplot=T, box=F)
# ARMS_euler.plot <- plot(euler(gl, shape = 'ellipse'),quantities=T)
# ARMS_euler.plot
ARMS_venn <- grid.arrange(ARMS_venn.plot,
top=grid::textGrob("ARMS: Differentially Expressed & Differentially Methylated Genes",gp=grid::gpar(fontsize=14,font=3))
)
ggsave(filename = "ARMS_VENN.png",plot = ARMS_venn,device = "png",path = "analysis/DE",dpi = "retina" )
Now we want to get the mean beta value for each sample in the dmr location: 1. Find all cpg that fall in the dmr 2. get the mean for each sample 3. Match each sample bVal with it’s expression value. (No way to do it, since there is no key sample to sample) 4. Make correlation 5. Plot.
ol <- dt_DEG_DMR[!is.na(Gene.Symbol)&!is.na(start),]library(data.table)
library(GenomicRanges)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
betas <-as.data.table(readRDS("data/betas_ARMS.rds"),keep.rownames = "ProbeID")
locs <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Locations
locs.ranges <- GRanges(locs$chr, IRanges(locs$pos, locs$pos))
names(locs.ranges) <- rownames(locs)
# locs.ranges$gene_element <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Group
# locs.ranges$gene <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
mcols(locs.ranges)<-IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other
dt_DEG_DMR_intersect<-lapply(1:NROW(ol),function(nr){
cont<-ol[nr,Contrast]
xpr <- counts[["ARMS"]][Probe.Set.ID==ol[nr,Probe.Set.ID]]
xpr<-xpr[,.SD,.SDcols=names(xpr)[!(names(xpr)%in%c("rn",names(anno)))]]
expression <- suppressWarnings(melt(xpr,measure.vars=colnames(xpr),variable.name = "SID",value.name = "TPM" ))
SIDs <- ss[Exp_SID %in% as.character(expression$SID),barcode]
cgs <- names(subsetByOverlaps(locs.ranges,GRanges(ol[nr,])))
#### Annotate DMRs with genomic location:
ann<-as.data.frame(mcols( subsetByOverlaps(locs.ranges,GRanges(ol[nr,]))))|>as.data.table(keep.rownames="ProbeID")
# Possibility of multiple genes and genomic location per probe
genomic<-lapply(1:NROW(ann),
function(cg){
# For every position or ProbeID do:
# 1. generate a data.table (per row)
UCSC_genes <-ann[cg, # 2. split UCSC gene and position
lapply( # 3. new row for every combination of gene/position
.SD, # 4. add Contrast porbeID and bval difference
function(x)unlist(strsplit(x,";"))
)
,.SDcols=c("UCSC_RefGene_Group","UCSC_RefGene_Name")]
enhancers<-ann[cg,.SD,.SDcols=unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(ann)[colnames(ann) %like% n]))]
return(cbind(UCSC_genes,enhancers))
}
)
# 5. rbind all together:
genomic_loc <- rbindlist(genomic)|> unique() # should get gene annotation from here to be fair
ge <- factor(genomic_loc$UCSC_RefGene_Group,levels = c("3'UTR","ExonBnd", "1stExon","5'UTR","Body","TSS200","TSS1500"),ordered=T )
gene_location <- ifelse(startsWith( as.character(max(ge)),"TSS" ),"promoter","intragenic")
annot <- genomic_loc[,lapply(.SD,function(x)paste(unique(x[nchar(x)>0]),collapse = ";"))]
annot$gene_location <- gene_location
####
b<-betas[ProbeID %in% cgs ,lapply(.SD,function(x)mean(x,na.rm=T)),.SDcols=SIDs]
meth <- suppressWarnings( melt(b,measure.vars=colnames(b),variable.name = "barcode",value.name = "DMR.meth" ))
meth$SID <- ss[meth,Exp_SID,on="barcode"]
# list(meth,expression)
m <- merge(meth,expression,by="SID")
m <- m[,lapply(.SD,as.numeric),by=SID]
dt<-cbind(ol[nr,],m)
cor<-cor.test(dt$DMR.meth,dt$TPM)
dt$correlation <- cor$estimate
dt$cor.pval <- cor$p.value
dt <- cbind(dt,annot)
dt$barcode=nr
return(dt)
})|>rbindlist()dt_cor <-dt_DEG_DMR_intersect
dt_cor$Condition <-ss[dt_DEG_DMR_intersect,"Condition",on="SID"]
dt_cor$Contrast <- as.factor(dt_cor$Contrast)
dt_cor$SID <- as.factor(dt_cor$SID)
reg_element_names<-unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(dt_cor)[colnames(dt_cor) %like% n]))
reg <- dt_cor[,sapply(.SD,nchar)>0,.SDcols=reg_element_names ]
dt_cor$regulatory <- apply(reg,1,function(x)paste(colnames(reg)[x],collapse=";"))
setorder(dt_cor,cor.pval,Gene)
saveRDS(dt_cor,"data/dt_cor.rds")reg_element_names<-unlist(sapply(c("Enhancer","OpenChromatin","TFBS"),function(n) colnames(dt_cor)[colnames(dt_cor) %like% n]))In the following table the values for methylation and expression are integrated:
dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","SID","DMR.meth","TPM","correlation","cor.pval")])library(ggplot2)
library(data.table)
library(ggpubr)
library(ggrepel)
DMR_DEG_plots <- list()
plots_folder <- "analysis/Integration_with_expression/DEGvsDMR/pval"
dir.create(plots_folder,recursive = T,showWarnings = F)
setkeyv(dt_cor,c("barcode"))
for(cont in unique(dt_cor$Contrast)){
ID<-dt_cor[Contrast==cont,head(unique(barcode),10)]
plist <- lapply(ID,function(nr){
pdata <- dt_cor[barcode==nr,]
plot_name <- paste0("/DEG_DMR.pval_", unlist(unique(dt_cor[barcode==nr,Gene]))[1], ".png")
plt_path <- paste(plots_folder, cont, sep = .Platform$file.sep)
dir.create(plt_path,recursive = T,showWarnings = F)
p<-ggplot(data=pdata, aes(x=as.numeric(TPM),y=as.numeric(DMR.meth),colour=SID)) +
geom_point(aes(shape=Condition),size=8,alpha=0.6) +
xlab("Mean Gene methylation (DMR)") + ylab("Mean Gene Expression (Probeset)") +
geom_smooth(method = "lm",
inherit.aes = F,
aes(x=as.numeric(pdata$TPM),y=as.numeric(pdata$DMR.meth)),
linetype="dashed",
se=F,
formula = "y ~ x")+
annotate(geom = "text",
x = -Inf, y = Inf,
label=paste0("correlation: ", round(unique(pdata$correlation),4), ", p-value: ", round(unique(pdata$cor.pval),8)),
hjust = 0, vjust = 1
) +
ggtitle(paste0(" Correlation between expression and methylation for gene: ",unique(pdata$Gene)))
ggsave(path = plt_path, filename = plot_name, plot=p)
return(p)
})
DMR_DEG_plots[[cont]] <- plist
}
saveRDS(DMR_DEG_plots,"data/DMR_DEG_plots.rds")# FUN <- function(cont) cat("\n\n#### Top Correlation plots for contrast ", cont, "\n
# ```{r}
# #| column: screen-inset-right
# #| layout-ncol: 4
# #| results: hide
# #| fig-height: 4
# #| fig-keep: all
# print(DMR_DEG_plots[[",cont,"]][1:10])
# ```\n")
#
# invisible(sapply(values$Contrast_names[[1]],FUN))
DMR_DEG_plotsThe pathways analysis will be performed analysing 2 sets of genes: 1. Full DMR/DMPs: the full set of significant diferentially methylated & expressed genes. 2. Cor.sig: A subset of the above where the correlation p.value is smaller than 0.05
Although the relationship between DNA methylation and gene expression is complex and the mechanisms involved in gene regulation by DNA methylation are diverse. Fortunately, over the years researchers have found some common relationships between gene methylation and gene expression:
DNA methylation is an epigenetic mark that has been traditionally associated with gene silencing, specially when methylation happens on promoter regions. DNA methylation is related with the repressed chromatin state which blocks the access of transcription factors to promoter regions. Thus, silencing promoter activity in a stable way and reducing transcription Suzuki and Bird (2008).
In the other hand, the bodies of active genes in plants and animals are often heavily methylated. Suzuki and Bird (2008)
Therefore, usually it is expected to find most of the genes to have negative correlations between DNA methylation and gene transcription in promoter regions while some of the genes might present positive correlations specially in intra-genic regions.
Now, let’s inspect our data, the following table shows a summary of the paired DMGs and DEGs:
tab.full <- dt_cor[,.(
negative.cor=sum(unique(correlation)<0),
positive.cor=sum(unique(correlation)>0),
Regulatory.elements=sum(nchar(unique(regulatory))>0),
Total=sum(unique(correlation)!=0)),
by=c("Contrast","barcode")]
tab_summary <-tab.full[,lapply(.SD,sum),by=Contrast,.SDcols=c("Regulatory.elements","negative.cor","positive.cor","Total")]
kableExtra::kbl(tab_summary)|>kableExtra::kable_classic_2()| Contrast | Regulatory.elements | negative.cor | positive.cor | Total |
|---|---|---|---|---|
| At0-All:Cell_line | 205 | 100 | 157 | 257 |
| At0-At1:tumors | 326 | 156 | 243 | 399 |
| At0-At3:start/end | 24 | 20 | 12 | 32 |
| At0At1 - At2At3:TuMet | 130 | 57 | 115 | 172 |
| At2-At3:metastasis | 64 | 52 | 41 | 93 |
Here you can see a summary table of the Total ammount of genes that are both Differentially Methylated and Differentially Expressed. For this to yield true, there has to be at least one DMR, composed by at least 3 probes (cpg) from the methylation microarray that fall in the same region of the genome as well as a significant difference in expression associated to the same gene: - Regulatory.elements: Genes that have a DMR associated to any of the following regulatory elements: r paste(reg_element_names, collapse=", ") - negative.cor: Total number of negatively correlated genes where methylation and expression work in opposite directions. - positive.cor: Total number of positively correlated genes where methylation and expression go in the same direction. - Total ammount of genes that are both DE and DM.
dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","no.cpgs","width","SID","DMR.meth","TPM","correlation","cor.pval", "gene_location",reg_element_names)])Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
This dataset is not filtered by the strength of the correlation between DMRs and DEGs.
path_results<-function(pathway,topN=50,method="method",pval=0.05,path="results/pathways.csv",cols=NULL){
# pathway<-pathway[FDR<1,]
pathway$method<-pathway[[method]]
data.table::setorder(pathway,method,FDR)
sig_idx <- pathway[,.I[FDR < pval] ,by=method]$V1
head_idx<-pathway[,.I[1:min(..topN,.N)],by=c(method,"Contrast")]$V1
res<-pathway[base::union(sig_idx,head_idx),]
# res[,TERM:=ifelse(FDR<pval,paste0("*** ",TERM," ***"),TERM)]
# data.table::fwrite(res,path)
results<-res[,.SD,.SDcols=c("Contrast","FDR",cols,"TERM","method")]
data.table::setorder(results,Contrast,method,FDR)
return(results)
}
library(gprofiler2)
pathways_full <- lapply(unique(dt_cor$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_cor[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full.rds")dtable(Pathway_Full)As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:
dt_pos <- dt_cor[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive.rds")dtable(Pathway_Full_Pos)dt_neg <- dt_cor[correlation<0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative.rds")dtable(Pathway_Full_neg)This dataset is not filtered by the strength of the correlation between DMRs and DEGs.
dt.sig <- dt_cor[cor.pval<0.05 ,.(correlation=unique(correlation)),by=c("Contrast","Gene","width")]
library(gprofiler2)
pathways_full <- lapply(unique(dt.sig$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt.sig[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full_sig.rds")dtable(Pathway_Full)As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:
dt_pos <- dt.sig[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive_sig.rds")dtable(Pathway_Full_Pos)dt_neg <- dt.sig[correlation<0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative_sig.rds")dtable(Pathway_Full_neg)–>
–>
–> –> –> –> –>
–>
–>
–>
–>
–> –> –> –> –>
–> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –>
–>
–>
–> –> –> –> –> –> –> –> –> –> –> –> –> –> –>
–>
–> –> –> –>
–>
–> –> –>